In [2]:
import pandas as pd

import numpy as np

from datetime import datetime

import math

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

import plotly.express as px

from urllib.request import urlopen

import json

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

Main Wildfire Dataset¶

In [3]:
# Load the raw CalFire incident records (one row per fire) from the project CSV.
wildfire_raw = pd.read_csv('California_Fire_Incidents.csv')
In [4]:
# Columns that are mostly NaN, constant across rows, or otherwise unused
# by the rest of the analysis.
COLUMNS_TO_DROP = [
    'AirTankers', 'ConditionStatement', 'ControlStatement', 'CrewsInvolved',
    'CountyIds', 'Dozers', 'Engines', 'Fatalities', 'FuelType', 'Helicopters',
    'Injuries', 'PersonnelInvolved', 'StructuresEvacuated', 'StructuresDamaged',
    'StructuresDestroyed', 'StructuresThreatened', 'WaterTenders', 'Active',
    'CanonicalUrl', 'Status', 'Updated', 'Final', 'PercentContained', 'Public',
    'Latitude', 'Longitude', 'Featured', 'Location', 'Name',
    'SearchDescription', 'SearchKeywords', 'UniqueId', 'AdminUnit',
]
wildfire_raw.drop(columns=COLUMNS_TO_DROP, inplace=True)
In [5]:
# Preview the columns that survived the drop.
wildfire_raw.head()
Out[5]:
AcresBurned ArchiveYear CalFireIncident Counties Extinguished MajorIncident Started
0 257314.0 2013 True Tuolumne 2013-09-06T18:30:00Z False 2013-08-17T15:25:00Z
1 30274.0 2013 True Los Angeles 2013-06-08T18:30:00Z False 2013-05-30T15:28:00Z
2 27531.0 2013 True Riverside 2013-07-30T18:00:00Z False 2013-07-15T13:43:00Z
3 27440.0 2013 False Placer 2013-08-30T08:00:00Z False 2013-08-10T16:30:00Z
4 24251.0 2013 True Ventura 2013-05-11T06:30:00Z True 2013-05-02T07:01:00Z

Data Cleaning¶

Columns to Impute¶

AcresBurned (a few NaN values) -- impute the records we can identify manually, then drop any remaining NaN rows

Extinguished (59 NaN values) -- EDA has told us it may be best to drop those rows

Other Columns to Handle¶

Counties (4 incorrect values) -- replace 'Mexico' with 'San Diego,' replace 'State of Oregon' with 'Siskiyou,' replace 'State of Nevada' with 'Nevada' (it's a California County)

In [6]:
# AcresBurned: fill four records whose acreage was found via a quick manual
# search of the incident reports.
MANUAL_ACRES = {
    614: 1324.0,
    1045: 210.0,
    1052: 5000.0,
    1367: 100.0,
}
for record_idx, acres in MANUAL_ACRES.items():
    wildfire_raw.loc[record_idx, 'AcresBurned'] = acres

# Anything still missing AcresBurned is dropped.
wildfire_raw.dropna(subset=['AcresBurned'], inplace=True)
In [7]:
# Counties: correct the four mislabeled county values
# (see the markdown note above for the reasoning).
COUNTY_FIXES = {
    1423: 'Siskiyou',
    1424: 'Nevada',
    1421: 'San Diego',
    1590: 'San Diego',
}
for record_idx, county in COUNTY_FIXES.items():
    wildfire_raw.loc[record_idx, 'Counties'] = county
In [8]:
# Function to convert the Started and Extinguished dates to datetime objects
def to_datetime(date):
  """Parse a 'YYYY-MM-DD...' date string into a datetime.

  Missing values (NaN/None/NaT) are returned unchanged so pandas
  null-handling keeps working downstream. Non-string, non-missing inputs
  are coerced with str() before parsing, matching the original behavior.

  Fix: the original `math.isnan(date)` raised TypeError for non-float,
  non-string missing values such as None.
  """
  if not isinstance(date, str):
    # NaN and NaT compare unequal to themselves; None is checked explicitly.
    if date is None or date != date:
      return date
  date_only = str(date)[:10] # ensure it's a string and strip the time part
  return datetime.strptime(date_only, '%Y-%m-%d')
In [9]:
# Extinguished: parse the date columns, then check whether burn duration can
# be predicted from acres burned (to decide how to fill missing rows).
wildfire_raw['StartedDate'] = wildfire_raw['Started'].apply(to_datetime)
wildfire_raw['ExtinguishedDate'] = wildfire_raw['Extinguished'].apply(to_datetime)

# Two start dates corrected by hand.
wildfire_raw.loc[1019, 'StartedDate'] = to_datetime('2017-05-19')
wildfire_raw.loc[1261, 'StartedDate'] = to_datetime('2018-08-08')

# Split into rows that need Extinguished filled vs. rows usable as training
# data. .copy() prevents the BurnDuration assignments below from writing into
# a view of wildfire_raw (SettingWithCopyWarning, currently masked by the
# global warnings filter).
null_extinguished = wildfire_raw[wildfire_raw['Extinguished'].isna()].copy()
notnull_extinguished = wildfire_raw[wildfire_raw['Extinguished'].notnull()].copy()
notnull_extinguished['BurnDuration'] = (wildfire_raw['ExtinguishedDate'] - wildfire_raw['StartedDate']) #timedelta object
notnull_extinguished['BurnDuration'] = [timestamp.days for timestamp in notnull_extinguished['BurnDuration']] #converting to int # days
notnull_extinguished = notnull_extinguished[notnull_extinguished['BurnDuration'] > 0] #getting rid of incorrect extinguished dates

acres_burned, burn_duration = notnull_extinguished['AcresBurned'], notnull_extinguished['BurnDuration']
plt.scatter(acres_burned, burn_duration, alpha=0.25, color='darkorange');
plt.ylim(0, 300);
plt.xlim(0, 4000);
plt.ylabel('Burn Duration (Days)');
plt.xlabel('Burned Area (Acres)');
plt.title('Training Data: Burn Duration vs. Acres Burned')
plt.savefig('burneduration_vs_area')
# NOTE(review): no visible trend here, which bodes poorly for the imputation
# model below.
In [10]:
# Linear regression: can acres burned predict burn duration?
x_pred = null_extinguished['AcresBurned'].to_numpy().reshape(-1, 1)  # rows to impute
x = acres_burned.to_numpy().reshape(-1, 1)
y = burn_duration.to_numpy().reshape(-1, 1)

# Hold out 10% to eyeball the residuals.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)
burn_duration_model = LinearRegression()
burn_duration_model.fit(x_train, y_train)
y_test_pred = burn_duration_model.predict(x_test)

residuals = y_test_pred - y_test
plt.hist(residuals, color='firebrick', edgecolor='black', bins=range(-325, 151, 50))
plt.title('(Predicted Burn Duration - True Burn Duration) Distribution')
plt.xlabel('Difference (# Days)')

# Refit on all of the training rows, then predict the missing durations.
burn_duration_model.fit(x, y)
y_pred = burn_duration_model.predict(x_pred)
In [11]:
# Regression line over the held-out points. The original set xlim(0, 100000)
# and then immediately overrode it with xlim(0, 4000) — the first call was
# dead code and is removed.
plt.scatter(x_test, y_test, color="darkorange", alpha = 0.25)
plt.plot(x_test, y_test_pred, color="blue")
plt.xlabel('Burned Area (Acres)');
plt.ylabel('Burn Duration (Days)');
plt.title('Predicted: Burn Duration vs. Acres Burned');
plt.ylim(0, 300);
plt.xlim(0, 4000);
print('Regression Coefficient: ' + str(burn_duration_model.coef_))
Regression Coefficient: [[0.00026496]]

TLDR: burn duration is not strongly related to acres burned, therefore we're dropping the missing extinguished date rows (only 59 rows out of the 1600+)

In [12]:
# The regression showed no usable relationship, so simply drop the rows that
# are missing an Extinguished date instead of imputing them.
wildfire_raw = wildfire_raw.dropna(subset=['Extinguished'])

Feature Engineering: External Data¶

Adding in average population data per county

Adding in square mileage per county

Adding in climate data per county

In [14]:
# Population per county, per year (Census county-estimates workbook).
population_raw = pd.read_excel('co-est2019-annres-06.xlsx', header = 3).rename(columns = {'Unnamed: 0': "County"}).iloc[:59, :]
# Strip the '.<name> County, California' formatting down to the county name.
# (The original chained a redundant `.iloc[:]` here — a no-op, removed.)
population_raw['County'] = population_raw['County'].str.extract(r'\.(.*?) County, California')
population_raw = population_raw.iloc[1:, :]  # drop the first row (not a county-level record)
population_raw = population_raw.add_suffix('_population')
population_raw.rename(columns = {'County_population': 'county_name'}, inplace=True)

# Average the population across years (columns 3 onward are yearly values).
population_raw['mean_population'] = population_raw.iloc[:, 3:].mean(axis=1)

# Keep only the county name and the averaged population; the per-year
# columns are generated instead of hard-coded.
year_columns = [str(year) + '_population' for year in range(2010, 2020)]
population_raw.drop(year_columns + ['Census_population', 'Estimates Base_population'],
                    axis=1, inplace=True)

population_raw.head()
Out[14]:
county_name mean_population
1 Alameda 1606881.9
2 Alpine 1102.5
3 Amador 37802.5
4 Butte 223503.3
5 Calaveras 45234.0
In [15]:
# square mileage per county
# Land area (square miles) per California county.
land_raw = pd.read_excel('land area.xlsx')[['Areaname', 'square miles']]
land_raw = land_raw[land_raw['Areaname'].str.contains(r', CA$')]  # CA rows only

# Build a tidy frame with the county name pulled out of 'Areaname'.
ca_land_raw = pd.DataFrame()
ca_land_raw['county_name'] = land_raw['Areaname'].str.extract(r'^(.+?),')
ca_land_raw['square_miles'] = land_raw['square miles']
ca_land_raw.head()
Out[15]:
county_name square_miles
192 Alameda 83.57
193 Alpine 4.57
194 Amador 11.73
195 Butte 37.62
196 Calaveras 16.81
In [16]:
# Combine the population and land-area tables into one county-level frame.
pop_and_area = population_raw.merge(ca_land_raw, on = 'county_name')
pop_and_area.head()
Out[16]:
county_name mean_population square_miles
0 Alameda 1606881.9 83.57
1 Alpine 1102.5 4.57
2 Amador 37802.5 11.73
3 Butte 223503.3 37.62
4 Calaveras 45234.0 16.81
In [17]:
# Attach population and area to each fire; the left join keeps every fire.
wildfire_raw = (
    wildfire_raw
    .merge(pop_and_area, how='left', left_on='Counties', right_on='county_name')
    .drop(columns='county_name')
)
In [18]:
# Sanity-check the merged frame.
wildfire_raw.head()
Out[18]:
AcresBurned ArchiveYear CalFireIncident Counties Extinguished MajorIncident Started StartedDate ExtinguishedDate mean_population square_miles
0 257314.0 2013 True Tuolumne 2013-09-06T18:30:00Z False 2013-08-17T15:25:00Z 2013-08-17 2013-09-06 54231.5 38.93
1 30274.0 2013 True Los Angeles 2013-06-08T18:30:00Z False 2013-05-30T15:28:00Z 2013-05-30 2013-06-08 10007550.7 691.45
2 27531.0 2013 True Riverside 2013-07-30T18:00:00Z False 2013-07-15T13:43:00Z 2013-07-15 2013-07-30 2335696.3 95.76
3 27440.0 2013 False Placer 2013-08-30T08:00:00Z False 2013-08-10T16:30:00Z 2013-08-10 2013-08-30 372878.5 98.41
4 24251.0 2013 True Ventura 2013-05-11T06:30:00Z True 2013-05-02T07:01:00Z 2013-05-02 2013-05-11 840546.2 362.90
In [22]:
# Monthly climate averages (temperature, wind, relative humidity) per county.
# The original repeated the same pivot/clean pipeline three times; it is now
# one helper, parameterized on the two spots where the copies differed
# (temp sorted before interpolating; wind was never back-filled).
def pivot_climate(path, suffix, sort=False, backfill=False):
    """Read a climate CSV and pivot to one row per (county, year).

    Month columns hold the 'Arithmetic Mean' values; months 11-12 are
    dropped and every column gets `suffix` appended. NaNs are interpolated
    and, when `backfill` is True, back-filled afterwards.
    """
    raw = pd.read_csv(path)
    monthly = raw.pivot_table(values = 'Arithmetic Mean', index = ['County Name', 'year'], columns = 'month')
    monthly.reset_index(inplace = True)
    monthly.drop([11, 12], axis = 1, inplace = True)
    monthly = monthly.add_suffix(suffix)
    if sort:
        monthly = monthly.sort_values(['County Name' + suffix, 'year' + suffix])
    monthly = monthly.interpolate()  # fill NaNs between observed values
    if backfill:
        monthly = monthly.bfill()    # fill leading NaNs
    return monthly

temp_2 = pivot_climate('california_temp.csv', '_temp', sort=True, backfill=True)
wind_2 = pivot_climate('california_wind.csv', '_wind')
rh_2 = pivot_climate('california_rh.csv', '_rh', backfill=True)
In [23]:
# Join the wildfire records with each climate table on (year, county).
wildfire = wildfire_raw
for climate_df, suffix in ((temp_2, 'temp'), (wind_2, 'wind'), (rh_2, 'rh')):
    wildfire = wildfire.merge(climate_df, how = 'left',
                              left_on = ['ArchiveYear', 'Counties'],
                              right_on = ['year_' + suffix, 'County Name_' + suffix])
In [24]:
# Remove the duplicated join keys plus the raw date strings (already parsed
# into StartedDate/ExtinguishedDate).
wildfire = wildfire.drop(columns = ['County Name_temp', 'year_temp',
                                    'County Name_wind', 'year_wind',
                                    'County Name_rh', 'year_rh',
                                    'Extinguished', 'Started'])
In [25]:
# Sort by county, then year, so the fills in the next cells propagate between
# chronologically adjacent records of the same county.
wildfire = wildfire.sort_values(['Counties', 'ArchiveYear'])
In [26]:
# Back-fill remaining NaNs from the next row (after the sort these are
# usually same-county neighbors).
wildfire = wildfire.bfill()
In [27]:
# Forward-fill anything the back-fill couldn't reach (NaNs in trailing rows).
wildfire = wildfire.ffill()

Geospatial County-wise EDA¶

In [29]:
# County boundary GeoJSON plus a FIPS-code lookup table, restricted to
# California counties.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

fips_to_state = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/minoritymajority.csv', dtype={"FIPS": str})
ca_mask = fips_to_state['STNAME'] == 'California'
fips_to_ca = fips_to_state[ca_mask][['FIPS', 'CTYNAME']]
fips_to_ca['CTYNAME'] = fips_to_ca['CTYNAME'].str.extract(r'(.*?) County')
In [31]:
# Total number of fires per county (CTYNAME survives the merge, so counting
# it gives one count per (county, FIPS) pair).
geo_df0 = (
    wildfire[['Counties']]
    .merge(fips_to_ca, left_on='Counties', right_on='CTYNAME')
    .groupby(['Counties', 'FIPS'])
    .count()
    .rename(columns={'CTYNAME': 'count'})
    .reset_index()
)

fig = px.choropleth(geo_df0, geojson=counties, locations='FIPS', color='count', scope='usa',
                    color_continuous_scale="Hot", hover_data=['Counties'],
                    labels={'count': 'Total Fires'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
In [32]:
# Total acres burned per county.
acres_by_county = wildfire[['Counties', 'AcresBurned']].groupby('Counties').sum().reset_index()
geo_df00 = acres_by_county.merge(fips_to_ca, left_on='Counties', right_on='CTYNAME').drop(columns='CTYNAME')

fig = px.choropleth(geo_df00, geojson=counties, locations='FIPS', color='AcresBurned', scope='usa',
                    color_continuous_scale="Hot", hover_data=['Counties', 'AcresBurned'],
                    labels={'AcresBurned': 'Total Acres Burned'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
In [33]:
# Average fire size per county = total acres burned / number of fires.
geo_df000 = geo_df0.merge(geo_df00, on=['FIPS', 'Counties'])
geo_df000['AvgAcresPerFire'] = geo_df000['AcresBurned'] / geo_df000['count']

fig = px.choropleth(geo_df000, geojson=counties, locations='FIPS',
                    color='AvgAcresPerFire', scope='usa',
                    color_continuous_scale="Hot",
                    hover_data=['Counties', 'AcresBurned', 'count'],
                    labels={'AvgAcresPerFire': 'Average Acres Burned per Fire'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
In [34]:
# County population map (one row per fire; each county still carries a
# single mean_population value, so the choropleth renders correctly).
geo_df1 = (wildfire[['Counties', 'mean_population']]
           .merge(fips_to_ca, left_on='Counties', right_on='CTYNAME')
           .drop(columns='CTYNAME'))

fig = px.choropleth(geo_df1, geojson=counties, locations='FIPS', color='mean_population', scope='usa',
                    color_continuous_scale="Viridis", hover_data=['Counties'],
                    labels={'mean_population': 'Population'})
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
In [35]:
# Average summer (Jun-Oct) temperature per county.
# Fixes: .copy() so the avgtemp assignment doesn't write into a view of
# `wildfire` (SettingWithCopyWarning, masked by the warnings filter), and
# agg('mean') instead of the deprecated agg(np.mean).
geo_df2 = wildfire[['Counties', '6_temp', '7_temp', '8_temp', '9_temp', '10_temp']].copy()
geo_df2['avgtemp'] = geo_df2.loc[:, '6_temp':'10_temp'].mean(axis=1)
geo_df2 = geo_df2.groupby('Counties').agg('mean').reset_index().merge(fips_to_ca, left_on='Counties', right_on = 'CTYNAME').drop('CTYNAME', axis = 1)


fig = px.choropleth(geo_df2, geojson = counties, locations = 'FIPS', color = 'avgtemp', scope='usa',
                    color_continuous_scale="Viridis", hover_data = ['Counties'],
                    labels={'avgtemp':'Average summer temperature across years'})
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
In [36]:
# Average summer (Jun-Oct) wind per county.
# Fixes: .copy() so the avgwind assignment doesn't write into a view of
# `wildfire`, and agg('mean') instead of the deprecated agg(np.mean).
geo_df3 = wildfire[['Counties', '6_wind', '7_wind', '8_wind', '9_wind', '10_wind']].copy()
geo_df3['avgwind'] = geo_df3.loc[:, '6_wind':'10_wind'].mean(axis=1)
geo_df3 = geo_df3.groupby('Counties').agg('mean').reset_index().merge(fips_to_ca, left_on='Counties', right_on = 'CTYNAME').drop('CTYNAME', axis = 1)


fig = px.choropleth(geo_df3, geojson = counties, locations = 'FIPS', color = 'avgwind', scope='usa',
                    color_continuous_scale="Viridis", hover_data = ['Counties'],
                    labels={'avgwind':'Average summer wind across years'})
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()
In [37]:
# Average summer (Jun-Oct) relative humidity per county.
# Fixes: .copy() so the avgrh assignment doesn't write into a view of
# `wildfire`, and agg('mean') instead of the deprecated agg(np.mean).
geo_df4 = wildfire[['Counties', '6_rh', '7_rh', '8_rh', '9_rh', '10_rh']].copy()
geo_df4['avgrh'] = geo_df4.loc[:, '6_rh':'10_rh'].mean(axis=1)
geo_df4 = geo_df4.groupby('Counties').agg('mean').reset_index().merge(fips_to_ca, left_on='Counties', right_on = 'CTYNAME').drop('CTYNAME', axis = 1)


fig = px.choropleth(geo_df4, geojson = counties, locations = 'FIPS', color = 'avgrh', scope='usa',
                    color_continuous_scale="Viridis", hover_data = ['Counties'],
                    labels={'avgrh':'Average Relative Humidity across months and years'})
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_geos(fitbounds="locations", visible=False)
fig.show()

Model¶

In [38]:
def severity_class(acres_burned):
  """Return the size-class letter for a fire of `acres_burned` acres.

  A: <= 0.25          B: (0.25, 10)     C: [10, 100)
  D: [100, 300)       E: [300, 1000)    F: [1000, 5000)
  G: >= 5000
  """
  if acres_burned <= 0.25:
    return "A"
  # Walk the class upper bounds in order; the first one that exceeds the
  # acreage names the class.
  for upper_bound, label in ((10, "B"), (100, "C"), (300, "D"),
                             (1000, "E"), (5000, "F")):
    if acres_burned < upper_bound:
      return label
  return "G"
In [39]:
# Label every fire with its severity class letter.
wildfire['Class'] = wildfire['AcresBurned'].apply(severity_class)
In [40]:
# How many fires fall into the two smallest classes?
num_classA = (wildfire['Class'] == 'A').sum()
num_classB = (wildfire['Class'] == 'B').sum()
print('Number of Class A and Class B fires: ' + str(num_classA + num_classB))
Number of Class A and Class B fires: 27
In [41]:
# Too few A/B fires to learn from — drop them, then encode the remaining
# classes as integers for the classifier (the A/B codes stay in the map
# for completeness, matching the original).
wildfire = wildfire[~wildfire['Class'].isin(['A', 'B'])]
wildfire['NumClass'] = wildfire['Class'].replace({'C': 0, 'D': 1, 'E': 2, 'F': 3, 'G': 4, 'A': 5, 'B': 6})
wildfire['NumClass']
Out[41]:
54      1
70      1
120     0
139     0
298     3
       ..
949     0
1037    0
1490    0
1491    0
1494    0
Name: NumClass, Length: 1550, dtype: int64
In [42]:
# Distribution of the classes that remain after dropping A and B above.
class_sizes = wildfire.groupby('Class')['AcresBurned'].count()
plt.bar(class_sizes.index, class_sizes.values, edgecolor='black');
plt.title('2013-2020 Wildfire Class Counts');
plt.xlabel('Class');
plt.ylabel('# Wildfires');
In [43]:
# Work on a copy so the duration/DOY feature assignments below don't write
# into a view of `wildfire` (SettingWithCopyWarning, masked by the global
# warnings filter).
temp_wildfire = wildfire[['StartedDate', 'ExtinguishedDate']].copy()
temp_wildfire['BurnDuration'] = (wildfire['ExtinguishedDate'] - wildfire['StartedDate']) #timedelta object
temp_wildfire['BurnDuration'] = [timestamp.days for timestamp in temp_wildfire['BurnDuration']] #converting to int # days
In [44]:
# Convert both dates to day-of-year so the model can pick up seasonality,
# then drop the raw date columns.
temp_wildfire['StartedDOY'] = temp_wildfire['StartedDate'].apply(lambda d: d.timetuple().tm_yday)
temp_wildfire['ExtinguishedDOY'] = temp_wildfire['ExtinguishedDate'].apply(lambda d: d.timetuple().tm_yday)
temp_wildfire.drop(columns=['StartedDate', 'ExtinguishedDate'], inplace=True)
In [45]:
# Assemble the model matrix: drop identifiers and leakage columns, add the
# engineered date features, then split with stratification on the class.
leakage_columns = ['StartedDate', 'ExtinguishedDate', 'AcresBurned', 'Class', 'NumClass', 'Counties']
X = wildfire.drop(leakage_columns, axis=1)
X['StartedDOY'] = temp_wildfire['StartedDOY']
X['ExtinguishedDOY'] = temp_wildfire['ExtinguishedDOY']
X['BurnDuration'] = temp_wildfire['BurnDuration']
y = wildfire[['NumClass']]
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=.30, random_state=42, stratify=wildfire['NumClass'])
In [46]:
# Fit the random forest. The target is passed as a 1-D Series — fitting with
# the (n, 1) DataFrame triggers a DataConversionWarning (hidden by the global
# warnings filter) and sklearn ravels it to 1-D anyway, so predictions are
# unchanged.
clf = RandomForestClassifier(max_depth=8, random_state = 42)
clf.fit(train_X, train_y['NumClass'])
Out[46]:
RandomForestClassifier(max_depth=8, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(max_depth=8, random_state=42)
In [47]:
# Accuracy on the training split.
train_predictions = clf.predict(train_X)
num_correct_train = sum(train_predictions == train_y['NumClass'].values)
num_predictions_train = len(train_y)

print("Model Train Accuracy: " + str(num_correct_train / num_predictions_train))
Model Train Accuracy: 0.7576036866359447
In [48]:
# Predictions and raw correctness counts for the held-out split.
clf_pred_y = clf.predict(test_X)
num_predictions_test = len(test_y)
num_correct_test = sum(clf_pred_y == test_y['NumClass'].values)
In [49]:
# Test accuracy — compare against the majority-class baseline two cells down.
print("Model Test Accuracy: " + str(num_correct_test / num_predictions_test))
Model Test Accuracy: 0.5010752688172043
In [50]:
# Baseline: accuracy of always predicting the majority class (NumClass 0 == 'C').
len(train_y[train_y['NumClass'] == 0]['NumClass']) / len(train_y) #prediction accuracy if we predicted majority class only
Out[50]:
0.4875576036866359

Model Visualizations¶

In [51]:
# Feature importances from the fitted random forest.
# Fixes: the original rebound `y` (shadowing the regression/classification
# target defined earlier), and a stray plt.figure(figsize=(20,20)) created an
# extra empty figure (the "<Figure ... with 0 Axes>" output) — both removed.
importances = clf.feature_importances_

fig, ax = plt.subplots()
width = 0.4  # bar thickness
ind = np.arange(len(importances))  # one y position per feature
ax.barh(ind, importances, width, color='green')
ax.set_yticks(ind + width / 10)
ax.set_yticklabels(X.columns, minor=False)
plt.title('Feature importance in RandomForest Classifier')
plt.xlabel('Relative importance')
plt.ylabel('feature')
fig.set_size_inches(10, 10, forward=True)
<Figure size 1440x1440 with 0 Axes>
In [52]:
# Side-by-side table of true vs. predicted class codes on the test split.
test_results = test_y.rename(columns={'NumClass': 'True'})
test_results['Predicted'] = clf_pred_y
test_results['counter'] = 1  # helper column so pivot_table can count rows
test_results.head()
Out[52]:
True Predicted counter
203 2 0 1
1434 1 0 1
105 0 0 1
72 1 0 1
655 4 2 1
In [53]:
# Confusion-matrix-style pivot: rows are true classes, columns are predicted
# codes, cells count occurrences.
results_pivot = (test_results
                 .pivot_table(index='True', columns='Predicted', values='counter', aggfunc='count')
                 .reset_index())
results_pivot['True'] = results_pivot['True'].replace({0:'C', 1:'D', 2:'E', 3:'F', 4:'G'})
results_pivot
Out[53]:
Predicted True 0 1 2 3 4
0 C 197.0 16.0 5.0 4.0 5.0
1 D 69.0 20.0 3.0 3.0 4.0
2 E 37.0 9.0 2.0 2.0 NaN
3 F 27.0 7.0 1.0 6.0 7.0
4 G 19.0 3.0 5.0 6.0 8.0
In [54]:
# Stacked bar of predicted-class composition within each true class.
# Fixed the title typo 'Clases' -> 'Classes'.
results_pivot.plot(x='True', kind='bar', stacked=True,
        title='Predicted and True Wildfire Classes')
plt.xlabel('True Class');
plt.ylabel('Count');
plt.legend(labels=['C', 'D', 'E', 'F', 'G'], title='Predicted Class');
plt.xticks(rotation=0);
In [55]:
# Letter labels for readability, plus a per-row correctness flag.
code_to_letter = {0:'C', 1:'D', 2:'E', 3:'F', 4:'G'}
test_results['TrueClass'] = test_results['True'].replace(code_to_letter)
test_results['CorrectPrediction'] = test_results['True'] == test_results['Predicted']
In [56]:
# Per-class hit rate. Uses the string aliases 'sum'/'count' — passing the
# np.sum / np.ma.count callables to .agg is deprecated in modern pandas and
# produces the same 'sum' and 'count' result columns.
correct_pred = test_results[['TrueClass', 'CorrectPrediction']].groupby('TrueClass').agg({"CorrectPrediction": ['sum', 'count']})
correct_pred['PropCorrect'] = correct_pred['CorrectPrediction']['sum'] / correct_pred['CorrectPrediction']['count']
In [57]:
# Bar chart of the per-class hit rates computed above.
(correct_pred.reset_index()
 .plot('TrueClass', 'PropCorrect', color='maroon', kind='bar',
       legend=False, title='Correct Prediction Proportions, By Class'));
plt.xlabel('Class');
plt.ylabel('Proportion');
plt.xticks(rotation=0);
In [58]:
# Final per-class accuracy table.
correct_pred.reset_index()[['TrueClass', 'PropCorrect']]
Out[58]:
TrueClass PropCorrect
0 C 0.867841
1 D 0.202020
2 E 0.040000
3 F 0.125000
4 G 0.195122